In [1]:
from sentence_transformers import SentenceTransformer
# import pyreadr
import os
import pandas as pd
import numpy as np
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
import textwrap
import itables
from sklearn.cluster import AgglomerativeClustering
itables.init_notebook_mode(all_interactive=True, connected=False)
Semantic Similarity Across Theoretical Constructs¶
Data Transformation¶
In [2]:
# Activate the automatic conversion of R objects to pandas objects
pandas2ri.activate()
# Expand the user path if needed
rdata_path = os.path.expanduser("~/Documents/Coding/SAA_OA/python_interfacing_ver.RData")
# Load the file using the R function load()
robjects.r['load'](rdata_path)
# List all objects currently in the R global environment
loaded_objects = robjects.r.ls()
# print("Loaded R objects:", list(loaded_objects))
# Replace 'my_data' with the actual name of the object loaded from the RData file
r_constructs = robjects.r['constructs']
# Convert to a pandas DataFrame
constructs = pandas2ri.rpy2py(r_constructs)
# Now you can work with the DataFrame in Python
print(constructs.head())
print(constructs['description'])
project description narrative_count \ 1 OA HP originates in parental representations and ... 30.0 2 OA HP is an attachment figure 32.0 3 OA The quality of one’s HP relationship influence... 24.0 4 OA Construction of a reliable HP is a pathway for... 35.0 5 SAA Spirituality reshapes behavioral schema 29.0 RT_count hierarchy TC 1 56.0 TC1 1 2 83.0 TC2 2 3 47.0 TC3 3 4 86.0 TC4 4 5 73.0 TC1 1 1 HP originates in parental representations and ... 2 HP is an attachment figure 3 The quality of one’s HP relationship influence... 4 Construction of a reliable HP is a pathway for... 5 Spirituality reshapes behavioral schema 6 Identity integrated with spirituality influenc... 7 HP is an attachment figure 8 Spirituality shapes social identity Name: description, dtype: object
Load a pretrained Model
In [3]:
# 1. Load a pretrained Sentence Transformer model
# model = SentenceTransformer("all-MiniLM-L6-v2") 5 times fasters
model = SentenceTransformer("all-mpnet-base-v2")
In [4]:
constructs = constructs.rename(columns={"project": "program"})
# Add a new column 'graph_label' combining 'description' and 'project' in the format of (<program>)
constructs['graph_label'] = constructs['description'] + " (" + constructs['program'] + ")"
print(constructs[['description', 'graph_label']])
description \
1 HP originates in parental representations and ...
2 HP is an attachment figure
3 The quality of one’s HP relationship influence...
4 Construction of a reliable HP is a pathway for...
5 Spirituality reshapes behavioral schema
6 Identity integrated with spirituality influenc...
7 HP is an attachment figure
8 Spirituality shapes social identity
graph_label
1 HP originates in parental representations and ...
2 HP is an attachment figure (OA)
3 The quality of one’s HP relationship influence...
4 Construction of a reliable HP is a pathway for...
5 Spirituality reshapes behavioral schema (SAA)
6 Identity integrated with spirituality influenc...
7 HP is an attachment figure (SAA)
8 Spirituality shapes social identity (SAA)
Performing Semantic Similarity Analysis¶
In [5]:
# Assuming 'constructs["description"]' is your list of sentences
sentences = constructs['description']
# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 384]
# 3. Calculate the embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# tensor([[1.0000, 0.6660, 0.1046],
# [0.6660, 1.0000, 0.1411],
# [0.1046, 0.1411, 1.0000]])
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
(8, 768)
tensor([[1.0000, 0.4711, 0.2740, 0.2876, 0.2060, 0.1137, 0.4711, 0.0822],
[0.4711, 1.0000, 0.3047, 0.2424, 0.0912, 0.1205, 1.0000, 0.0803],
[0.2740, 0.3047, 1.0000, 0.7878, 0.3226, 0.4944, 0.3047, 0.2786],
[0.2876, 0.2424, 0.7878, 1.0000, 0.2760, 0.3242, 0.2424, 0.1612],
[0.2060, 0.0912, 0.3226, 0.2760, 1.0000, 0.5964, 0.0912, 0.6619],
[0.1137, 0.1205, 0.4944, 0.3242, 0.5964, 1.0000, 0.1205, 0.6947],
[0.4711, 1.0000, 0.3047, 0.2424, 0.0912, 0.1205, 1.0000, 0.0803],
[0.0822, 0.0803, 0.2786, 0.1612, 0.6619, 0.6947, 0.0803, 1.0000]])
In [6]:
# If 'similarities' is a PyTorch tensor, convert it to a NumPy array first:
similarity_matrix = similarities.detach().cpu().numpy()
# Create a DataFrame with sentences as both index and columns:
labels = constructs['graph_label']
similarity_df = pd.DataFrame(similarity_matrix, index=labels, columns=labels)
itables.show(similarity_df, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
| graph_label | HP originates in parental representations and can be modified (OA) | HP is an attachment figure (OA) | The quality of one’s HP relationship influences food addiction recovery (OA) | Construction of a reliable HP is a pathway for food addiction recovery (OA) | Spirituality reshapes behavioral schema (SAA) | Identity integrated with spirituality influences sex addiction (SAA) | HP is an attachment figure (SAA) | Spirituality shapes social identity (SAA) |
|---|---|---|---|---|---|---|---|---|
| graph_label | ||||||||
Loading ITables v2.2.5 from the init_notebook_mode cell...
(need help?) |
Visualization¶
In [7]:
%config InlineBackend.figure_format = 'svg'
import seaborn as sns
import matplotlib.pyplot as plt
import textwrap
import numpy as np
# Mask the upper triangle and diagonal of the heatmap
mask = np.triu(np.ones_like(similarity_df, dtype=bool))
# Create the figure and heatmap, and get the current Axes object
plt.figure(figsize=(10, 10))
ax = sns.heatmap(similarity_df,mask=mask, annot=True, fmt=".2f", cmap="Blues",
cbar=True, linewidths=0.5)
plt.title("Semantic Similarity of Theoretical Constructs")
# Remove the axis titles
ax.set_xlabel('')
ax.set_ylabel('')
# Wrap and rotate x-axis labels:
xlabels = ax.get_xticklabels()
wrapped_xlabels = [textwrap.fill(label.get_text(), width=40) for label in xlabels]
ax.set_xticklabels(wrapped_xlabels, rotation=45, ha='right') # Rotate x labels
# For y-axis: Wrap the text with a width of 25.
ylabels = ax.get_yticklabels()
wrapped_ylabels = [textwrap.fill(label.get_text(), width=25) for label in ylabels]
ax.set_yticklabels(wrapped_ylabels, rotation=0) # Keep y labels horizontal
plt.tight_layout() # Adjust layout to fit labels properly
plt.show()
Key Similarities between Theoretical Constructs¶
- (2,7) = 1.00
- "HP is an attachment figure (OA)" vs. "HP is an attachment figure (SAA)"
- Essentially identical, indicating both programs use the same language and meaning around HP-as-attachment.
- (3,4) = 0.82
- "The quality of one’s HP relationship influences food addiction recovery (OA)" vs. "Construction of a reliable HP is a pathway for food addiction recovery (OA)"
- Indicates strong conceptual overlap; both constructs center on HP’s trustworthiness and its role in supporting recovery.
- (5,8) = 0.71
- "Spirituality reshapes behavioral schema (SAA)" vs. "Spirituality shapes social identity (SAA)"
- Shows a close connection between reconfiguring behaviors and reshaping social identity through spirituality in SAA.
- (5,6) = 0.56
- "Spirituality reshapes behavioral schema (SAA)" vs. "Identity integrated with spirituality influences sex addiction (SAA)"
- Reflects moderate alignment, indicating that spiritual transformation of behavior and its integration into identity are closely related in SAA.
- (1,2) = 0.54 and (1,7) = 0.54
- (1,2): "HP originates in parental representations and can be modified (OA)" vs. "HP is an attachment figure (OA)"
- (1,7): "HP originates in parental representations and can be modified (OA)" vs. "HP is an attachment figure (SAA)"
- These values indicate a moderate similarity, suggesting that understanding HP as influenced by parental models moderately aligns with viewing HP as an attachment figure.
- (3,6) = 0.35
- "The quality of one’s HP relationship influences food addiction recovery (OA)" vs. "Identity integrated with spirituality influences sex addiction (SAA)"
- Shows some conceptual resonance between HP relational quality and spiritual identity integration, though less pronounced than within-program comparisons.
Lower Similarities¶
- Many cross-program comparisons fall in the 0.05–0.30 range, reflecting differing emphases between OA (focused on developmental/attachment aspects) and SAA (focused on identity and behavior change).
- Some pairs show near-zero similarity (e.g., around 0.02–0.03), highlighting fundamental differences where the concepts diverge strongly.
Overall Observations¶
- Within-program constructs share the highest similarities.
- In OA, the relational quality and reliable construction of HP (0.82) are closely linked.
- In SAA, constructs related to behavioral schema and social identity (0.71 and 0.56) show a strong connection.
- The exact phrasing of "HP is an attachment figure" in OA and SAA (2,7) aligns perfectly (1.00), indicating shared conceptualization despite different program contexts.
- Cross-program comparisons generally show lower similarity scores, reflecting that OA’s focus on developmental and attachment models differs from SAA’s emphasis on identity transformation through spirituality.
Semantic Similarity Across Relevant Texts under Theoretical Constructs¶
Rationale is to check the similarity of the relevant texts of 2 similar or related theoretical constructs and themes (esp. across programs)
Data Cleaning¶
In [8]:
# Load Relevant Text Data
# Replace 'my_data' with the actual name of the object loaded from the RData file
r_RT_data = robjects.r['RT_data']
# Convert to a pandas DataFrame
RT_data = pandas2ri.rpy2py(r_RT_data)
# Extract TC and Theme with Orphan handling
# Extract the numeric part following 'TC' for the TC column
RT_data['TC'] = RT_data['code'].apply(
lambda x: int(x.split('>')[1].strip()[2:]) if 'Orphan' not in x else np.nan
)
# Extract the numeric part following 'T' for the theme column
RT_data['theme'] = RT_data['code'].apply(
lambda x: int(x.split('>')[2].strip()[1:]) if 'Orphan' not in x else np.nan
)
RT_data['ID'] = range(1, len(RT_data)+1)
RT_SAA = RT_data[RT_data['program'] == 'SAA'].reset_index(drop=True)
RT_OA = RT_data[RT_data['program'] == 'OA'].reset_index(drop=True)
# Now you can work with the DataFrame in Python
itables.show(RT_data.head(), buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
| code | RT | characters | Beginning | End | narrative_num | RI | program | coverage | TC | theme | ID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading ITables v2.2.5 from the init_notebook_mode cell...
(need help?) |
In [9]:
# Load Hierarchy Data
# Replace 'my_data' with the actual name of the object loaded from the RData file
r_hierarchy = robjects.r['hierarchy']
# Convert to a pandas DataFrame
hierarchy = pandas2ri.rpy2py(r_hierarchy)
# Now you can work with the DataFrame in Python
hierarchy.head()
hierarchy = hierarchy.applymap(lambda x: None if isinstance(x, (int, float)) and x < 0 else x)
# Replace NaN values with 0 before converting to integers
hierarchy = hierarchy.fillna(0)
hierarchy = hierarchy.apply(lambda col: col.astype(int) if col.dtypes == 'float64' else col)
# Replace 0 values back to NaN after converting to integers
hierarchy = hierarchy.replace(0, np.nan)
/var/folders/pt/qhvg39g17wv22v0b1s3b9bv00000gn/T/ipykernel_96849/677195587.py:12: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead. hierarchy = hierarchy.applymap(lambda x: None if isinstance(x, (int, float)) and x < 0 else x)
Semantic Similarities in Relevant Texts under the Theoretical Construct "HP is an attachment figure" in both OA and SAA¶
In Hierarchy, search for theoretical constructs "HP is an attachment figure" in both programs, find the TC value and get all the RTs under the TCs
In [10]:
# Filter RT_data based on hierarchy and search terms
def filter_rt_by_category(hierarchy_df, rt_data_df, search_list, category):
"""
Filters the RT_data DataFrame based on search terms found in the hierarchy DataFrame.
Parameters:
- hierarchy_df: DataFrame containing hierarchy information.
- rt_data_df: DataFrame containing the RT (relevant texts) data.
- search_list: List of strings to search for in the 'description' column of hierarchy_df.
- category: A string indicating which category to use. Allowed values are "TC", "theme", or "RI".
Returns:
- A DataFrame containing the filtered RT_data rows.
"""
filtered_hierarchy = hierarchy_df[hierarchy_df['description'].isin(search_list)]
filtered_rt_data = pd.DataFrame()
for _, row in filtered_hierarchy.iterrows():
program = row['program']
if category == "TC":
# Get the TC value and construct the code text
code_fragment = f"TC{int(row['TC'])}"
elif category == "theme":
# Assumes a column 'theme' exists with numeric values
code_fragment = f"T{int(row['theme'])}"
elif category == "RI":
# Assumes a column 'RI' exists with numeric values
code_fragment = f"RI{int(row['RI'])}"
else:
raise ValueError("Category must be one of: 'TC', 'theme', or 'RI'.")
# Filter RT_data for matching code and program
filtered_rt = rt_data_df[
(rt_data_df['code'].str.contains(code_fragment)) &
(rt_data_df['program'] == program)
]
filtered_rt_data = pd.concat([filtered_rt_data, filtered_rt])
# Add a new column 'label' combining 'description' and 'project' in the format of (<program>)
filtered_rt_data['label'] = filtered_rt_data['RT'] + " (" + filtered_rt_data['program'] + ")"
# print(filtered_rt_data[['RT', 'label']])
return filtered_rt_data
# Example usage:
# search_terms = ["HP is an attachment figure"]
# rt_data_filtered = filter_rt_by_category(hierarchy, RT_data, search_terms, "TC")
# print(rt_data_filtered)
# Filter RT_data for TC values
search_terms = ["HP is an attachment figure"]
rt_data_filtered = filter_rt_by_category(hierarchy, RT_data, search_terms, "TC")
# Display the filtered RTs
print(rt_data_filtered)
code RT \
321 OA > TC2 > T3 > RI26 Then without my control, without me doing anyt...
322 OA > TC2 > T3 > RI26 This time was different. My mind had nothing t...
323 OA > TC2 > T8 > RI8 My Higher Power sent me the message: I was as ...
324 OA > TC2 > T3 > RI26 I’m very grateful that God has removed the obs...
325 OA > TC2 > T3 > RI37 It took me so long to pick up the set of spiri...
.. ... ...
281 SAA > TC3 > T8 > RI38 or the first three steps over and over, like a...
285 SAA > TC3 > T8 > RI38 It became more and more important to admit the...
295 SAA > TC3 > T6 > RI35 Another gift of the program is the emphasis on...
297 SAA > TC3 > T9 > RI33 I had a sense that the group was right for me....
316 SAA > TC3 > T8 > RI29 Since the day I admitted my powerlessness and ...
characters Beginning End narrative_num RI program coverage TC \
321 189.0 10.0 10.0 7 26 OA 0.037052 2.0
322 70.0 10.0 10.0 7 26 OA 0.013723 2.0
323 86.0 11.0 11.0 21 8 OA 0.015297 2.0
324 53.0 20.0 20.0 21 26 OA 0.009427 2.0
325 114.0 21.0 21.0 21 37 OA 0.020278 2.0
.. ... ... ... ... .. ... ... ...
281 54.0 7.0 7.0 41 38 SAA 0.004676 3.0
285 173.0 13.0 13.0 41 38 SAA 0.014980 3.0
295 300.0 16.0 16.0 6 35 SAA 0.036670 3.0
297 337.0 9.0 9.0 3 33 SAA 0.044671 3.0
316 146.0 5.0 5.0 17 29 SAA 0.037932 3.0
theme ID label
321 3.0 321 Then without my control, without me doing anyt...
322 3.0 322 This time was different. My mind had nothing t...
323 8.0 323 My Higher Power sent me the message: I was as ...
324 3.0 324 I’m very grateful that God has removed the obs...
325 3.0 325 It took me so long to pick up the set of spiri...
.. ... ... ...
281 8.0 281 or the first three steps over and over, like a...
285 8.0 285 It became more and more important to admit the...
295 6.0 295 Another gift of the program is the emphasis on...
297 9.0 297 I had a sense that the group was right for me....
316 8.0 316 Since the day I admitted my powerlessness and ...
[154 rows x 13 columns]
Perform Semantic Similarity Analysis
In [11]:
def compute_similarities(sentences, model):
"""
Compute embeddings for a series of sentences and return their similarity matrix.
Parameters:
- sentences: A list or pandas Series of text strings.
- model: A pretrained SentenceTransformer model.
Returns:
- embeddings: A numpy array of sentence embeddings.
- similarities: A torch Tensor containing the similarity scores.
"""
embeddings = model.encode(sentences)
# print("Embeddings shape:", embeddings.shape) # For verification
similarities = model.similarity(embeddings, embeddings)
return embeddings, similarities
# Compute the similarities for the filtered RT data
embeddings, similarities = compute_similarities(rt_data_filtered['RT'], model)
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
In [12]:
def create_similarity_df(similarities, label_series):
"""
Convert a similarity tensor/array to a DataFrame with given labels as index and columns.
Parameters:
- similarities: a torch.Tensor or a NumPy array of similarity values.
- label_series: a list-like object (e.g., pandas Series) containing labels.
Returns:
- A pandas DataFrame with similarity values and labels as both index and columns.
"""
# Convert to a numpy array if similarities is a torch.Tensor
if hasattr(similarities, "detach"):
sim_matrix = similarities.detach().cpu().numpy()
else:
sim_matrix = similarities
similarity_temp = pd.DataFrame(sim_matrix, index=label_series, columns=label_series) # still in matrix like dataframe
return matrix_to_df(similarity_temp)
def matrix_to_df(sim_df): # sub function of the function above
# Create a mask for the upper triangle (excluding the diagonal)
mask = np.triu_indices_from(sim_df, k=1)
# Use the mask to get sentence pairs and their corresponding similarity values
top_pairs = pd.DataFrame({
'Sentence A': sim_df.index[mask[0]],
'Sentence B': sim_df.columns[mask[1]],
'Similarity Score': sim_df.values[mask]
})
return top_pairs
# Create the similarity DataFrame using the function
sim_df = create_similarity_df(similarities, rt_data_filtered['label'])
itables.show(sim_df, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
| Sentence A | Sentence B | Similarity Score |
|---|---|---|
Loading ITables v2.2.5 from the init_notebook_mode cell...
(need help?) |
In [13]:
# Filter the DataFrame to keep only the OA-SAA pairs
# and remove duplicates
def process_oa_saa_pairs(df):
"""
Process a DataFrame to filter, deduplicate, and standardize OA-SAA sentence pairs.
Args:
df (pd.DataFrame): Input DataFrame with 'Sentence A' and 'Sentence B' columns.
Returns:
pd.DataFrame: Processed DataFrame with 'OA' and 'SAA' columns.
"""
# Create ordered pairs for deduplication
df['Ordered Pair'] = df.apply(lambda row: tuple(sorted([row['Sentence A'], row['Sentence B']])), axis=1)
# Filter for OA-SAA pairs and deduplicate
mask = ((df['Sentence A'].str.contains('(OA)', regex=False) & df['Sentence B'].str.contains('(SAA)', regex=False)) |
(df['Sentence A'].str.contains('(SAA)', regex=False) & df['Sentence B'].str.contains('(OA)', regex=False)))
df = df[mask].drop_duplicates(subset='Ordered Pair').reset_index(drop=True)
# Ensure OA sentences are in the first column
swap_mask = ~df['Sentence A'].str.contains('(OA)', regex=False)
df.loc[swap_mask, ['Sentence A', 'Sentence B']] = df.loc[swap_mask, ['Sentence B', 'Sentence A']].values
# Rename columns and drop 'Ordered Pair'
df = df.rename(columns={'Sentence A': 'OA', 'Sentence B': 'SAA'}).drop(columns='Ordered Pair')
return df
def run_rt_similarity_analysis(hierarchy_df, rt_data_df, search_list, category, model): # Master function
rt_data_filtered = filter_rt_by_category(hierarchy_df, rt_data_df, search_list, category)
embeddings, similarities = compute_similarities(rt_data_filtered['RT'], model)
sim_df = create_similarity_df(similarities, rt_data_filtered['label'])
oa_saa_pairs = process_oa_saa_pairs(sim_df).sort_values('Similarity Score', ascending=False)
return oa_saa_pairs
search_terms = ["HP is an attachment figure"]
# Call the function using the previously defined search_terms
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "TC", model)
# Display the processed DataFrame
itables.show(oa_saa_pairs.head(20), buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
| OA | SAA | Similarity Score | |
|---|---|---|---|
Loading ITables v2.2.5 from the init_notebook_mode cell...
(need help?) |
Semantic Similarities in Relevant Texts under the Theoretical Constructs about relational quality (OA) and social identity reconstruction (SAA)¶
- The quality of one’s HP relationship influences food addiction recovery (OA)
- Spirituality shapes social identity (SAA)
In [14]:
# Filter RT_data for TC values
search_terms = ["The quality of one’s HP relationship influences food addiction recovery",
"Spirituality shapes social identity"]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "TC", model)
# Display the processed DataFrame
itables.show(oa_saa_pairs.head(20), buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
| OA | SAA | Similarity Score | |
|---|---|---|---|
Loading ITables v2.2.5 from the init_notebook_mode cell...
(need help?) |
Semantic Similarities in Relevant Texts under the Theoretical Constructs about HP reliability (OA) and reshaping behavioral schema (SAA)¶
- Construction of a reliable HP is a pathway for food addiction recovery (OA)
- Spirituality reshapes behavioral schema (SAA)
In [15]:
# Filter RT_data for TC values
search_terms = ["Construction of a reliable HP is a pathway for food addiction recovery",
"Spirituality reshapes behavioral schema"]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "TC", model)
# Display the processed DataFrame
itables.show(oa_saa_pairs.head(20), buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
| OA | SAA | Similarity Score | |
|---|---|---|---|
Loading ITables v2.2.5 from the init_notebook_mode cell...
(need help?) |
Semantic Similarities/Differences in Relevant Texts under the Theoretical Constructs that are different¶
- HP originates in parental representations and can be modified (OA)
- Identity integrated with spirituality influences sex addiction (SAA)
In [16]:
# Filter RT_data for TC values
search_terms = ["HP originates in parental representations and can be modified",
"Identity integrated with spirituality influences sex addiction"]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "TC", model)
# Display the processed DataFrame
# oa_saa_pairs.head(20)
itables.show(oa_saa_pairs.head(20), buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
| OA | SAA | Similarity Score | |
|---|---|---|---|
Loading ITables v2.2.5 from the init_notebook_mode cell...
(need help?) |
Semantic Similarity Across Relevant Texts under Themes¶
Surrender as Entry Point¶
- OA-TC2-T10: Surrender, acceptance, and trust in HP lead to recovery.
- SAA-TC3-T9: Key steps toward admitting powerlessness and surrendering.
In [17]:
# Filter RT_data for TC values
search_terms = ["Surrender, acceptance, and trust in HP lead to recovery",
"Key steps toward admitting powerlessness and surrendering"]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "theme", model)
# Display the processed DataFrame
itables.show(oa_saa_pairs, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
| OA | SAA | Similarity Score | |
|---|---|---|---|
Loading ITables v2.2.5 from the init_notebook_mode cell...
(need help?) |
Higher Power as Behavioral Regulator¶
| OA | SAA |
|---|---|
| TC2-T3 ("HP acts as an agent of support"):- RI16: Shifting dependence to HP cures compulsions- RI37: HP provides power to recover | TC1-T2 ("Spirituality helps me abstain from sexual behaviors."):- RI18: HP removes triggers/harmful environments- RI17: Mental influences for abstinence |
In [18]:
# Filter RT_data for TC values
search_terms = ["HP acts as an agent of support",
"Spirituality helps me abstain from sexual behaviors."]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "theme", model)
# Display the processed DataFrame
itables.show(oa_saa_pairs, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
| OA | SAA | Similarity Score | |
|---|---|---|---|
Loading ITables v2.2.5 from the init_notebook_mode cell...
(need help?) |
Identity Reformation¶
OA-TC3-T11 ("Having a HP stabilizes one’s inner world"):
- RI19: “Spiritual Awakening” changes behaviors and identity
- RI27: Recovery starts with understanding self-worth
SAA-TC2-T4 ("Spirituality transformed parts of my identity to resolve my sex addiction."):
- RI41: Spiritual and sexual identity integration
- RI22: Guidance from God/HP
In [19]:
# Filter RT_data for TC values
search_terms = ["Having a HP stabilizes one’s inner world",
"Spirituality transformed parts of my identity to resolve my sex addiction."]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "theme", model)
# Display the processed DataFrame
itables.show(oa_saa_pairs, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
| OA | SAA | Similarity Score | |
|---|---|---|---|
Loading ITables v2.2.5 from the init_notebook_mode cell...
(need help?) |
Spirituality can have negative effects if misunderstood or misapplied.¶
- SAA: Spirituality is more harmful than good (5.2)
- OA: An uninformed HP can be a negative influence on recovery (2)
In [20]:
# Filter RT_data for TC values
search_terms = ["Spirituality is more harmful than good in my life.",
"An uninformed HP can be a negative influence on one’s recovery"]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "theme", model)
# Display the processed DataFrame
itables.show(oa_saa_pairs, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
| OA | SAA | Similarity Score | |
|---|---|---|---|
Loading ITables v2.2.5 from the init_notebook_mode cell...
(need help?) |
Spirituality as Social Support¶
- HP is cocreated from meaningful relationships and experiences (OA)
- Bidirectional influence of social support and spirituality (social support ↔ Spirituality) (SAA)
In [21]:
# Filter RT_data for TC values
search_terms = ["HP is cocreated from meaningful relationships and experiences",
"Bidirectional influence of social support and spirituality (social support ↔ Spirituality)"]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "theme", model)
# Display the processed DataFrame
itables.show(oa_saa_pairs, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
| OA | SAA | Similarity Score | |
|---|---|---|---|
Loading ITables v2.2.5 from the init_notebook_mode cell...
(need help?) |
Comparison 2:
- HP acts as an agent of support (OA)
- Bidirectional influence of social support and spirituality (social support ↔ Spirituality) (SAA)
In [35]:
# Filter RT_data for TC values
search_terms = ["HP acts as an agent of support",
"Bidirectional influence of social support and spirituality (social support ↔ Spirituality)"]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "theme", model)
# Display the processed DataFrame
itables.show(oa_saa_pairs, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
| OA | SAA | Similarity Score | |
|---|---|---|---|
Loading ITables v2.2.5 from the init_notebook_mode cell...
(need help?) |
The comfort and companionship provided by spirituality/HP¶
- HP fulfills one’s need for comfort and acceptance (OA)
- Thanks to spirituality, I do not have to fight sex addiction alone. (SAA)
The similarity score of sentences is lower than most other pairs.
In [34]:
# Filter RT_data for TC values
search_terms = ["HP fulfills one’s need for comfort and acceptance",
"Thanks to spirituality, I do not have to fight sex addiction alone."]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "theme", model)
# Display the processed DataFrame
itables.show(oa_saa_pairs, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
| OA | SAA | Similarity Score | |
|---|---|---|---|
Loading ITables v2.2.5 from the init_notebook_mode cell...
(need help?) |
Hierarchical Clustering (Forced Grounded Theory Analysis)¶
Semi-Supervised process
45, 15, 5 for the number of RI, themes, and TCs
In [22]:
# Calculate the embeddings for the OA and SAA sentences
OA_embeddings = model.encode(RT_OA['RT'])
SAA_embeddings = model.encode(RT_SAA['RT'])
# Check the shape of the embeddings (for debugging)
SAA (doesn't make a lot of sense)¶
Level 1 (Repeating Ideas)¶
In [23]:
# Set a distance threshold to control cluster granularity.
# You may need to adjust the threshold based on your data characteristics.
# distance_threshold = None # Set to None for automatic clustering
clustering_ri = AgglomerativeClustering(
n_clusters=47, # Set the number of clusters, 47 is currently the optimal number
distance_threshold=None, # only used if n_clusters is None
linkage='ward'
)
RI_labels = clustering_ri.fit_predict(SAA_embeddings)
RT_SAA['RI_cluster'] = RI_labels
# Validate clusters: remove clusters if the cluster has fewer than 2 texts
# or if texts come from the same narrative
valid_ri_clusters = []
for cluster in RT_SAA['RI_cluster'].unique():
cluster_data = RT_SAA[RT_SAA['RI_cluster'] == cluster]
if len(cluster_data) >= 2 and cluster_data['narrative_num'].nunique() >= 2:
valid_ri_clusters.append(cluster)
RT_SAA_valid = RT_SAA[RT_SAA['RI_cluster'].isin(valid_ri_clusters)]
print("Valid RI clusters:", valid_ri_clusters)
print("Valid number of RI clusters:", len(valid_ri_clusters)) # 40
print("Valid number of RTs categorized under RIs", len(RT_SAA_valid))
Valid RI clusters: [0, 23, 5, 18, 35, 34, 16, 25, 9, 4, 2, 1, 6, 10, 11, 20, 14, 38, 41, 37, 3, 22, 13, 27, 24, 21, 40, 19, 17, 33, 36, 29, 8, 31, 44, 15, 28, 43, 7, 45, 12, 42] Valid number of RI clusters: 42 Valid number of RTs categorized under RIs 308
In [24]:
# output to excel
output_path = os.path.expanduser("~/Documents/Coding/SAA_OA/data/RT_SAA_valid.xlsx")
RT_SAA_valid.to_excel(output_path, index=False)
print(f"Filtered RT_SAA_valid DataFrame saved to {output_path}")
Filtered RT_SAA_valid DataFrame saved to /Users/charlesli/Documents/Coding/SAA_OA/data/RT_SAA_valid.xlsx
Level 2 (Themes)¶
In [25]:
# For clustering themes, it's often useful to aggregate texts within each RI cluster.
# One simple method is to join RT texts per cluster and encode the aggregated text.
aggregated_texts = RT_SAA_valid.groupby('RI_cluster')['RT'].apply(lambda texts: " ".join(texts)).tolist()
# Get embeddings for aggregated texts (themes)
theme_embeddings = model.encode(aggregated_texts)
# Specify target number of themes (e.g., 15, but adjust if needed)
num_theme_clusters = None
clustering_theme = AgglomerativeClustering(
n_clusters=num_theme_clusters,
distance_threshold=1,
linkage='ward'
)
theme_labels = clustering_theme.fit_predict(theme_embeddings)
# Map theme labels back to RI clusters
ri_to_theme = dict(zip(RT_SAA_valid['RI_cluster'].unique(), theme_labels))
RT_SAA_valid['theme_cluster'] = RT_SAA_valid['RI_cluster'].map(ri_to_theme)
# --------------------------
# Step 2: Validate Themes (RIs to Themes)
# Only keep themes that have at least 2 unique RIs.
# --------------------------
# Count the unique RI clusters per theme in the valid RI clusters.
ri_counts_per_theme = RT_SAA_valid.groupby('theme_cluster')['RI_cluster'].nunique()
valid_theme_clusters = ri_counts_per_theme[ri_counts_per_theme >= 2].index
RT_SAA_valid_themes = RT_SAA_valid[RT_SAA_valid['theme_cluster'].isin(valid_theme_clusters)]
print("\nStep 2 - Valid themes (must have at least 2 unique RIs):")
print("Valid number of RTs categorized under themes", len(RT_SAA_valid_themes))
print("Valid number of RI clusters categorized under themes:", len(RT_SAA_valid_themes['RI_cluster'].unique())) # 40
print("Number of valid_themes: ", len(valid_theme_clusters))
Step 2 - Valid themes (must have at least 2 unique RIs): Valid number of RTs categorized under themes 256 Valid number of RI clusters categorized under themes: 34 Number of valid_themes: 9
/var/folders/pt/qhvg39g17wv22v0b1s3b9bv00000gn/T/ipykernel_96849/1094038717.py:21: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy RT_SAA_valid['theme_cluster'] = RT_SAA_valid['RI_cluster'].map(ri_to_theme)
Fallback Technique (Compromising)¶
In [26]:
from numpy.linalg import norm
underpopulated_themes = {theme: count for theme, count in ri_counts_per_theme.items() if count < 2}
# For fallback, compute an average embedding for each theme.
# First create a mapping from each RI_cluster (key) to its embedding.
unique_ri_clusters = RT_SAA_valid['RI_cluster'].unique()
ri_embedding_dict = dict(zip(unique_ri_clusters, theme_embeddings))
# Group RI clusters by theme_cluster
theme_assignment = RT_SAA_valid.groupby('theme_cluster')['RI_cluster'].unique().to_dict()
# Calculate an aggregated embedding for each theme as the mean of its RI embeddings.
theme_embedding_dict = {}
for theme, ri_list in theme_assignment.items():
emb_list = [ri_embedding_dict[ri] for ri in ri_list if ri in ri_embedding_dict]
if emb_list:
theme_embedding_dict[theme] = np.mean(emb_list, axis=0)
# Define a simple cosine distance function.
def cosine_distance(vec1, vec2):
return 1 - np.dot(vec1, vec2) / (norm(vec1) * norm(vec2) + 1e-10)
In [27]:
# Set a merging threshold for fallback (adjust as appropriate)
MERGE_THRESHOLD = 0.6
# For each underpopulated (invalid) theme, check if a nearby valid theme exists.
for under_theme in list(underpopulated_themes.keys()):
emb_under = theme_embedding_dict[under_theme]
best_match = None
best_distance = float("inf")
# Compare against all valid themes.
# all_themes = ri_counts_per_theme.index #(all theme clusters)
for valid_theme in valid_theme_clusters: #
emb_valid = theme_embedding_dict[valid_theme]
distance = cosine_distance(emb_under, emb_valid)
if distance < best_distance:
best_distance = distance
best_match = valid_theme
# If a sufficiently similar valid theme is found, reassign all RI clusters from the underpopulated theme.
if best_distance < MERGE_THRESHOLD and best_match is not None:
# Reassign RI clusters that were coded under the underpopulated theme to the best_match theme.
for key, val in ri_to_theme.items():
if val == under_theme:
ri_to_theme[key] = best_match
print(f"Merged underpopulated theme {under_theme} into theme {best_match} (distance={best_distance:.3f}).")
else:
print(f"No suitable merge found for underpopulated theme {under_theme}; it remains provisional.")
# Update the theme_cluster column using the updated mapping.
RT_SAA_valid['theme_cluster'] = RT_SAA_valid['RI_cluster'].map(ri_to_theme)
# --------------------------
# (C) Final Validation: Only keep themes with at least 2 unique RI clusters.
# --------------------------
ri_counts_per_theme = RT_SAA_valid.groupby('theme_cluster')['RI_cluster'].nunique()
final_valid_theme_clusters = ri_counts_per_theme[ri_counts_per_theme >= 2].index
RT_SAA_valid_themes = RT_SAA_valid[RT_SAA_valid['theme_cluster'].isin(final_valid_theme_clusters)]
# Print final validation results
print("\nStep 2 - Valid themes (must have at least 2 unique RIs):")
print("Valid number of RTs categorized under themes:", len(RT_SAA_valid_themes))
print("Valid number of RI clusters categorized under themes:", len(RT_SAA_valid_themes['RI_cluster'].unique()))
print("Number of valid themes:", len(final_valid_theme_clusters))
Merged underpopulated theme 8 into theme 0 (distance=0.495). No suitable merge found for underpopulated theme 9; it remains provisional. Merged underpopulated theme 11 into theme 1 (distance=0.450). Merged underpopulated theme 12 into theme 7 (distance=0.539). No suitable merge found for underpopulated theme 13; it remains provisional. Merged underpopulated theme 14 into theme 10 (distance=0.506). Merged underpopulated theme 15 into theme 3 (distance=0.386). Merged underpopulated theme 16 into theme 7 (distance=0.412). Step 2 - Valid themes (must have at least 2 unique RIs): Valid number of RTs categorized under themes: 297 Valid number of RI clusters categorized under themes: 40 Number of valid themes: 9
/var/folders/pt/qhvg39g17wv22v0b1s3b9bv00000gn/T/ipykernel_96849/47481451.py:28: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy RT_SAA_valid['theme_cluster'] = RT_SAA_valid['RI_cluster'].map(ri_to_theme)
If we merge between the unvalid themes
In [28]:
# Set a merging threshold for fallback (adjust as appropriate)
MERGE_THRESHOLD = 0.6
# For each underpopulated (invalid) theme, check if a nearby valid theme exists.
for under_theme in list(underpopulated_themes.keys()):
emb_under = theme_embedding_dict[under_theme]
best_match = None
best_distance = float("inf")
# Compare against all valid themes.
all_themes = ri_counts_per_theme.index #(all theme clusters)
for theme in all_themes: #
if theme != under_theme:
emb_all = theme_embedding_dict[theme]
distance = cosine_distance(emb_under, emb_all)
if distance < best_distance:
best_distance = distance
best_match = theme
# If a sufficiently similar valid theme is found, reassign all RI clusters from the underpopulated theme.
if best_distance < MERGE_THRESHOLD and best_match is not None:
# Reassign RI clusters that were coded under the underpopulated theme to the best_match theme.
for key, val in ri_to_theme.items():
if val == under_theme:
ri_to_theme[key] = best_match
print(f"Merged underpopulated theme {under_theme} into theme {best_match} (distance={best_distance:.3f}).")
else:
print(f"No suitable merge found for underpopulated theme {under_theme}; it remains provisional.")
# Update the theme_cluster column using the updated mapping.
RT_SAA_valid['theme_cluster'] = RT_SAA_valid['RI_cluster'].map(ri_to_theme)
# --------------------------
# (C) Final Validation: Only keep themes with at least 2 unique RI clusters.
# --------------------------
ri_counts_per_theme = RT_SAA_valid.groupby('theme_cluster')['RI_cluster'].nunique()
final_valid_theme_clusters = ri_counts_per_theme[ri_counts_per_theme >= 2].index
RT_SAA_valid_themes = RT_SAA_valid[RT_SAA_valid['theme_cluster'].isin(final_valid_theme_clusters)]
# Print final validation results
print("\nStep 2 - Valid themes (must have at least 2 unique RIs):")
print("Valid number of RTs categorized under themes:", len(RT_SAA_valid_themes))
print("Valid number of RI clusters categorized under themes:", len(RT_SAA_valid_themes['RI_cluster'].unique()))
print("Number of valid themes:", len(final_valid_theme_clusters))
Merged underpopulated theme 8 into theme 0 (distance=0.495). No suitable merge found for underpopulated theme 9; it remains provisional. Merged underpopulated theme 11 into theme 1 (distance=0.450). Merged underpopulated theme 12 into theme 7 (distance=0.539). No suitable merge found for underpopulated theme 13; it remains provisional. Merged underpopulated theme 14 into theme 10 (distance=0.506). Merged underpopulated theme 15 into theme 3 (distance=0.386). Merged underpopulated theme 16 into theme 7 (distance=0.412). Step 2 - Valid themes (must have at least 2 unique RIs): Valid number of RTs categorized under themes: 297 Valid number of RI clusters categorized under themes: 40 Number of valid themes: 9
/var/folders/pt/qhvg39g17wv22v0b1s3b9bv00000gn/T/ipykernel_96849/4223298657.py:29: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy RT_SAA_valid['theme_cluster'] = RT_SAA_valid['RI_cluster'].map(ri_to_theme)
Level 3 (Theoretical Constructs) (problematic)¶
In [29]:
# Aggregate RT texts within each valid theme cluster by joining texts from associated RIs.
aggregated_theme_texts = RT_SAA_valid_themes.groupby('theme_cluster')['RT'].apply(lambda texts: " ".join(texts)).tolist()
# Get embeddings for aggregated themes using your preloaded sentence transformer model.
tc_embeddings = model.encode(aggregated_theme_texts)
# Specify the target number of theoretical constructs, e.g., 5.
num_tc_clusters = 6
# Perform clustering on the aggregated theme embeddings.
clustering_tc = AgglomerativeClustering(
n_clusters=num_tc_clusters,
distance_threshold=None,
linkage='ward'
)
tc_labels = clustering_tc.fit_predict(tc_embeddings)
# Map the generated TC labels back to the associated theme clusters.
# First, get a sorted list of unique theme clusters from RT_SAA_valid_themes.
unique_themes = sorted(RT_SAA_valid_themes['theme_cluster'].unique())
# Build a mapping: each unique theme gets a corresponding TC label.
theme_to_tc = dict(zip(unique_themes, tc_labels))
# Create a new column for theoretical construct labels.
RT_SAA_valid_themes['TC_cluster'] = RT_SAA_valid_themes['theme_cluster'].map(theme_to_tc)
# Validate these TC clusters by counting unique themes within each TC.
tc_counts = RT_SAA_valid_themes.groupby('TC_cluster')['theme_cluster'].nunique()
print("\nStep 3 - Preliminary TC clusters and their unique theme counts:")
print(tc_counts)
# Force each theoretical construct to have at least 2 unique themes.
# For any TC with less than 2 themes, apply a fallback strategy (here, we duplicate its theme list).
tc_groups = RT_SAA_valid_themes.groupby('TC_cluster')['theme_cluster'].unique().to_dict()
final_tc_assignment = {}
for tc, themes in tc_groups.items():
if len(themes) < 2:
print(f"TC {tc} has only {len(themes)} unique theme(s). Forcing assignment.")
# Force the TC to meet the minimum requirement – here we duplicate the theme.
final_tc_assignment[tc] = list(themes) * 2
else:
final_tc_assignment[tc] = list(themes)
print("\nFinal TC assignments (each TC now forced to have at least 2 themes):")
print(final_tc_assignment)
Step 3 - Preliminary TC clusters and their unique theme counts:
TC_cluster
0 2
1 1
2 3
3 1
4 1
5 1
Name: theme_cluster, dtype: int64
TC 1 has only 1 unique theme(s). Forcing assignment.
TC 3 has only 1 unique theme(s). Forcing assignment.
TC 4 has only 1 unique theme(s). Forcing assignment.
TC 5 has only 1 unique theme(s). Forcing assignment.
Final TC assignments (each TC now forced to have at least 2 themes):
{0: [10, 0], 1: [7, 7], 2: [1, 3, 6], 3: [4, 4], 4: [2, 2], 5: [5, 5]}
/var/folders/pt/qhvg39g17wv22v0b1s3b9bv00000gn/T/ipykernel_96849/2795416904.py:25: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy RT_SAA_valid_themes['TC_cluster'] = RT_SAA_valid_themes['theme_cluster'].map(theme_to_tc)
Fallback Technique (Compromising)¶
In [30]:
# Define cosine distance function
def cosine_distance(vec1, vec2):
return 1 - np.dot(vec1, vec2) / (norm(vec1) * norm(vec2) + 1e-10)
# ---------------------------
# Step 1: Precompute Pairwise Theme Distances
# ---------------------------
def compute_pairwise_distances(theme_ids, theme_embedding_dict):
pairwise_distances = {}
for i, th1 in enumerate(theme_ids):
for j in range(i + 1, len(theme_ids)):
th2 = theme_ids[j]
dist = cosine_distance(theme_embedding_dict[th1], theme_embedding_dict[th2])
pairwise_distances[(th1, th2)] = dist
pairwise_distances[(th2, th1)] = dist
return pairwise_distances
In [ ]:
# Create a DataFrame from the tc_embeddings numpy array using the ri_counts_per_theme.index as the index
# tc_embeddings_df = pd.DataFrame(tc_embeddings, index=ri_counts_per_theme.index)
# tc_embeddings_df
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[31], line 2 1 # Create a DataFrame from the tc_embeddings numpy array using the ri_counts_per_theme.index as the index ----> 2 tc_embeddings_df = pd.DataFrame(tc_embeddings, index=ri_counts_per_theme.index) 3 tc_embeddings_df File /opt/homebrew/lib/python3.10/site-packages/pandas/core/frame.py:827, in DataFrame.__init__(self, data, index, columns, dtype, copy) 816 mgr = dict_to_mgr( 817 # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no 818 # attribute "name" (...) 824 copy=_copy, 825 ) 826 else: --> 827 mgr = ndarray_to_mgr( 828 data, 829 index, 830 columns, 831 dtype=dtype, 832 copy=copy, 833 typ=manager, 834 ) 836 # For data is list-like, or Iterable (will consume into list) 837 elif is_list_like(data): File /opt/homebrew/lib/python3.10/site-packages/pandas/core/internals/construction.py:336, in ndarray_to_mgr(values, index, columns, dtype, copy, typ) 331 # _prep_ndarraylike ensures that values.ndim == 2 at this point 332 index, columns = _get_axes( 333 values.shape[0], values.shape[1], index=index, columns=columns 334 ) --> 336 _check_values_indices_shape_match(values, index, columns) 338 if typ == "array": 339 if issubclass(values.dtype.type, str): File /opt/homebrew/lib/python3.10/site-packages/pandas/core/internals/construction.py:420, in _check_values_indices_shape_match(values, index, columns) 418 passed = values.shape 419 implied = (len(index), len(columns)) --> 420 raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}") ValueError: Shape of passed values is (9, 768), indices imply (11, 768)
Fallback Technique¶
In [ ]:
pairwise_distances = compute_pairwise_distances(tc_embeddings_df.index, theme_embedding_dict)
In [ ]:
# Count unique themes per TC
initial_tc_counts = RT_SAA_valid_themes.groupby('TC_cluster')['theme_cluster'].nunique()
# Identify underpopulated TCs (less than 3 themes)
underpopulated_tcs = initial_tc_counts[initial_tc_counts < 2].index.tolist()
populated_tcs = initial_tc_counts[initial_tc_counts >= 2].index.tolist()
redistribution_mapping = {}
# Iterate through underpopulated TCs
for under_tc in underpopulated_tcs:
# Get themes in the underpopulated TC
under_themes = RT_SAA_valid_themes[RT_SAA_valid_themes['TC_cluster'] == under_tc]['theme_cluster'].unique()
# Find the closest populated TC for each theme in the underpopulated TC
for theme in under_themes:
best_target_tc = None
best_distance = float('inf')
for populated_tc in populated_tcs:
target_themes = RT_SAA_valid_themes[RT_SAA_valid_themes['TC_cluster'] == populated_tc]['theme_cluster'].unique()
# Calculate average distance between the theme and all themes in the populated TC
distances = [pairwise_distances.get((theme, t), pairwise_distances.get((t, theme), float('inf'))) for t in target_themes]
avg_distance = np.mean(distances) if distances else float('inf')
if avg_distance < best_distance:
best_distance = avg_distance
best_target_tc = populated_tc
# Assign the theme to the best target TC
if best_target_tc is not None and best_distance <= MERGE_THRESHOLD:
redistribution_mapping[theme] = best_target_tc
# Apply redistribution mapping to update TC assignments
for theme, new_tc in redistribution_mapping.items():
RT_SAA_valid_themes.loc[RT_SAA_valid_themes['theme_cluster'] == theme, 'TC_cluster'] = new_tc
# ---------------------------
# Step 3: Verify Updated TC Distribution
# ---------------------------
updated_tc_counts = RT_SAA_valid_themes.groupby('TC_cluster')['theme_cluster'].nunique()
print("\nUpdated TC distribution:")
print(updated_tc_counts)
# Finalize the updated DataFrame
RT_SAA_valid_TC = RT_SAA_valid_themes.copy()
print("\nFinal RT_SAA_valid_TC DataFrame:")
print(RT_SAA_valid_TC)
Updated TC distribution:
TC_cluster
0 4
1 5
2 3
Name: theme_cluster, dtype: int64
Final RT_SAA_valid_TC DataFrame:
code \
0 SAA > TC2 > T15 > RI6
1 SAA > Orphan RIs > RI26
2 SAA > Orphan RIs > RI26
3 SAA > TC4 > T10 > RI34
4 SAA > Orphan RIs > RI26
.. ...
313 SAA > TC4 > T12 > RI21
314 SAA > TC2 > T14 > RI23
315 SAA > TC3 > T8 > RI29
316 SAA > TC2 > T4 > RI49
317 SAA > TC4 > T11 > RI39
RT characters Beginning \
0 My religion told me homosexuality was wrong, s... 351.0 1.0
1 It was not “the next one” that saved me—it was... 247.0 13.0
2 After all, my partner was at church and would ... 368.0 7.0
3 A man entered the room and related that having... 353.0 29.0
4 The lightning jolts of terror I registered the... 697.0 3.0
.. ... ... ...
313 As I have maintained my commitment to recovery... 325.0 6.0
314 But my acting out was getting worse rather tha... 286.0 3.0
315 Since the day I admitted my powerlessness and ... 146.0 5.0
316 I hoped that by praying and working my program... 172.0 3.0
317 Eventually I met a woman and we began a commit... 204.0 7.0
End narrative_num RI program coverage TC theme ID RI_cluster \
0 1.0 4 6 SAA 0.039688 2.0 15.0 1 44
1 13.0 4 26 SAA 0.027929 NaN NaN 2 14
2 7.0 4 26 SAA 0.041610 NaN NaN 3 9
3 29.0 2 34 SAA 0.022808 4.0 10.0 4 2
4 4.0 2 26 SAA 0.045035 NaN NaN 5 25
.. ... ... .. ... ... ... ... ... ...
313 6.0 5 21 SAA 0.048843 4.0 12.0 314 3
314 3.0 17 23 SAA 0.074305 2.0 14.0 315 12
315 5.0 17 29 SAA 0.037932 3.0 8.0 316 19
316 3.0 17 49 SAA 0.044687 2.0 4.0 317 26
317 7.0 17 39 SAA 0.053001 4.0 11.0 318 3
theme_cluster TC_cluster
0 1 2
1 10 1
2 0 0
3 9 1
4 3 0
.. ... ...
313 19 1
314 3 0
315 0 0
316 19 1
317 19 1
[308 rows x 15 columns]
In [ ]:
print("Final Cluster Overview")
print(len(RT_SAA)) # total Relevant texts
print(len(RT_SAA_valid)) # valid RI clusters
print(len(RT_SAA_valid_themes)) # valid themes
print(len(RT_SAA_valid_TC)) # valid TCs
Final Cluster Overview 318 308 308 308